# Creates output to be passed along to rest of model
library(pacman)
p_load(
  here, data.table, fst, purrr, lubridate, dplyr, stringr
)

# First the model coefficients table
good_model_dt = 
  read.fst(
    path = here("Data/electricity-generation/unit-model-fit/good-model-dt.fst"),
    as.data.table = TRUE
  )[,.(
    orispl_code, unitid, 
    intercept,
    excess_load_cal,excess_load_cal_sq,
    excess_load_mro,excess_load_mro_sq,
    excess_load_npcc,excess_load_npcc_sq,
    excess_load_rfc,excess_load_rfc_sq,
    excess_load_serc,excess_load_serc_sq,
    excess_load_tre,excess_load_tre_sq,
    excess_load_wecc,excess_load_wecc_sq,
    log_scale, est_extremum,
    loglik, loglik_df
  )] |>
  setkey(orispl_code, unitid) 
# Writing the results as a csv
fwrite(
  x = good_model_dt,
  file = here("Data/electricity-generation/output/PPRegressors.csv")
)

# Unit information table 
unit_info_dt = read.fst(
  here("Data/electricity-generation/unit-info-dt.fst"),
  as.data.table = TRUE
) |>
  setkey(orispl_code, unitid, year)

fwrite(
  x = unit_info_dt[,.(orispl_code, unitid, namepcap)],
  file = here("Data/electricity-generation/output/PPCap.csv")
)
# Stack heights
fwrite(
  x = unit_info_dt[,.(orispl_code, unitid, stack_height)],
  file = here("Data/electricity-generation/output/StackHeight.csv")
)


# Load table 
nerc_load_dt_raw = 
  read.fst(
    path = here("Data/electricity-generation/clean-load/nerc-load-dt.fst"),
    as.data.table = TRUE
  )[!is.na(nerc_adj)] |>
  setkey(nerc_adj, utc_time)

# Reading the electricity generation files (with fitted values)
elec_gen_dt = 
  read.fst(
    path = here("Data/electricity-generation/unit-model-fit/elec-gen-fit-dt.fst"),
    as.data.table = TRUE
  ) |>
  merge(
    unit_info_dt,
    by = c('orispl_code','unitid')
  )

# Summarizing production by region
nerc_gen_dt = 
  elec_gen_dt[,.(
    tot_gload_mwh = sum(gload_mwh),
    tot_fit_gload = sum(fit_gload)), 
    keyby = .(nerc_adj, utc_time)
  ]

# Adding gen data to load data
nerc_load_dt = 
  merge(
    nerc_gen_dt,
    nerc_load_dt_raw, 
    by = c("nerc_adj","utc_time"),
    all.x = TRUE
  )

# Renewables 
fwrite(
  x = nerc_load_dt[,.(
    nerc_adj, utc_time, 
    load_nd, 
    load_solar, 
    load_wind = load_nd - load_solar,
    load_hydro
  )],
  file = here("Data/electricity-generation/output/RegRenew.csv")
)

# Total Load
fwrite(
  x = nerc_load_dt[,.(
    nerc_adj, utc_time, 
    excess_load, 
    excess_load_adj = excess_load - load_ff + tot_gload_mwh
  )],
  file = here("Data/electricity-generation/output/RegLoad.csv")
)



# Now loading coefficients for own region only 
unit_mod_coef_dt =
  read.fst(
    path = here("Data/electricity-generation/unit-model-fit/unit-mod-coef-dt.fst"),
    as.data.table = TRUE
  )
max_nerc_load_dt = 
  read.fst(
    path = here("Data/electricity-generation/clean-load/nerc-load-dt.fst"),
    as.data.table = TRUE
  )[!is.na(nerc_adj) & year(utc_time) == 2019,.(
    max_eload = max(excess_load)), 
    keyby = nerc_adj
  ]
# Merging with unit info to get nerc_adj and to max excess load for each region
all_model_dt =
  unit_mod_coef_dt |>
  dcast(
    orispl_code + unitid + fml + loglik + loglik_df ~ rn,
    value.var = "estimate",
    fill = 0
  ) |>
  setnames(
    old = '(Intercept)',
    new = 'intercept'
  ) %>% .[,':='(
    has_sq_term = str_detect(fml, 'sq')
  )] |>
  merge(
    unit_info_dt,
    by = c('orispl_code','unitid')
  ) |>
  merge(
    max_nerc_load_dt,
    by = c("nerc_adj")
  )
# Checking that derivative wrt own region excess load is
# positive at 0 and at max excess load
# Also checking that all coefs are postive
all_model_dt[,':='(
  increasing_at_0 = fcase(
    nerc_adj == 'CAL',  excess_load_cal  >= 0,
    nerc_adj == 'MRO',  excess_load_mro  >= 0,
    nerc_adj == 'NPCC', excess_load_npcc >= 0,
    nerc_adj == 'RFC',  excess_load_rfc  >= 0,
    nerc_adj == 'SERC', excess_load_serc >= 0,
    nerc_adj == 'TRE',  excess_load_tre  >= 0,
    nerc_adj == 'WECC', excess_load_wecc >= 0
  ),
  increasing_at_max = fcase(
    nerc_adj == 'CAL',  excess_load_cal  + excess_load_cal_sq*max_eload  >= 0,
    nerc_adj == 'MRO',  excess_load_mro  + excess_load_mro_sq*max_eload  >= 0,
    nerc_adj == 'NPCC', excess_load_npcc + excess_load_npcc_sq*max_eload >= 0,
    nerc_adj == 'RFC',  excess_load_rfc  + excess_load_rfc_sq*max_eload  >= 0,
    nerc_adj == 'SERC', excess_load_serc + excess_load_serc_sq*max_eload >= 0,
    nerc_adj == 'TRE',  excess_load_tre  + excess_load_tre_sq*max_eload  >= 0,
    nerc_adj == 'WECC', excess_load_wecc + excess_load_wecc_sq*max_eload >= 0
  ),
  coefs_positive = 
    excess_load_cal  >= 0 & 
    excess_load_mro  >= 0 & 
    excess_load_npcc >= 0 & 
    excess_load_rfc  >= 0 & 
    excess_load_serc >= 0 & 
    excess_load_tre  >= 0 & 
    excess_load_wecc >= 0 
)][,# Creating indicator for models that fit our restrictions 
   good_model := coefs_positive & increasing_at_0 & increasing_at_max
]
fml_dt_raw = 
  data.table(
    nerc_adj = c(
      'TRE',
      rep('WECC',2),
      rep('CAL', 2),
      rep('MRO', 7),
      rep('NPCC', 7),
      rep('RFC', 7),
      rep('SERC', 7)
    ),
    control_1 = c(
      NA, 
      NA, 'cal', 
      NA, 'wecc', 
      NA, 'npcc','npcc','npcc','rfc','rfc','serc', 
      NA, 'mro','mro','mro','rfc','rfc','serc', 
      NA, 'mro','mro','mro','npcc','npcc','serc', 
      NA, 'mro','mro','mro','npcc','npcc','rfc'
    ),
    control_2 = c(
      NA, 
      NA, NA, 
      NA, NA, 
      NA, 'rfc','rfc','serc','serc', NA, NA, 
      NA, 'rfc','rfc','serc','serc', NA, NA, 
      NA, 'npcc','npcc','serc','serc', NA, NA, 
      NA, 'npcc','npcc','rfc','rfc', NA, NA
    ),
    control_3 = c(
      NA, 
      NA, NA, 
      NA, NA, 
      NA, 'serc',NA, NA, NA, NA, NA, 
      NA, 'serc',NA, NA, NA, NA, NA, 
      NA, 'serc',NA, NA, NA, NA, NA, 
      NA, 'rfc',NA, NA, NA, NA, NA
    ),
    square = FALSE
  )
fml_dt =
  rbind(
    fml_dt_raw[,.(nerc_adj, control_1, control_2, control_3, terms = 'quadratic')],
    fml_dt_raw[,.(nerc_adj, control_1, control_2, control_3, terms = 'linear')],
    fml_dt_raw[
      is.na(control_1) & is.na(control_2) & is.na(control_3),
      .(nerc_adj, control_1, control_2, control_3, terms = 'constant')
    ]
  )[,
    ic_formula := fcase(
      terms == 'linear', 
      paste(
        paste0('gload_mwh ~ excess_load_',tolower(nerc_adj)),
        paste0('excess_load_',control_1),
        paste0('excess_load_',control_2),
        paste0('excess_load_',control_3),
        sep = ' + '
      ),
      terms == 'quadratic', 
      paste(
        paste0('gload_mwh ~ excess_load_',tolower(nerc_adj)),
        paste0('excess_load_',tolower(nerc_adj),'_sq'),
        paste0('excess_load_',control_1),
        paste0('excess_load_',control_2),
        paste0('excess_load_',control_3),
        sep = ' + '
      ),
      terms == 'constant', 'gload_mwh ~ 1'
    ) |> str_remove_all(' \\+ excess_load_NA')
  ][,
    own_region_only := is.na(control_1) & is.na(control_2) & is.na(control_3)
  ] 
# Repeating for a model with just own region load 
own_region_model_dt = 
  merge(
    all_model_dt,
    fml_dt[,.(ic_formula, own_region_only)] |> unique(),
    by.x = 'fml', by.y = 'ic_formula',
  )[own_region_only == TRUE & good_model == TRUE,
    .SD[which.max(loglik)], 
    by = .(orispl_code, unitid)
  ][,.(
    orispl_code, unitid, 
    intercept,
    excess_load_cal,excess_load_cal_sq,
    excess_load_mro,excess_load_mro_sq,
    excess_load_npcc,excess_load_npcc_sq,
    excess_load_rfc,excess_load_rfc_sq,
    excess_load_serc,excess_load_serc_sq,
    excess_load_tre,excess_load_tre_sq,
    excess_load_wecc,excess_load_wecc_sq,
    log_scale, 
    est_extremum = fcase(
      str_detect(fml, 'cal_sq'), -excess_load_cal/(2*excess_load_cal_sq),
      str_detect(fml, 'mro_sq'), -excess_load_mro/(2*excess_load_mro_sq),
      str_detect(fml, 'npcc_sq'), -excess_load_npcc/(2*excess_load_npcc_sq),
      str_detect(fml, 'rfc_sq'), -excess_load_rfc/(2*excess_load_rfc_sq),
      str_detect(fml, 'serc_sq'), -excess_load_serc/(2*excess_load_serc_sq),
      str_detect(fml, 'tre_sq'), -excess_load_tre/(2*excess_load_tre_sq),
      str_detect(fml, 'wecc_sq'), -excess_load_wecc/(2*excess_load_wecc_sq)
    ),
    loglik, loglik_df
  )] |>
    setkey(orispl_code, unitid) 


# Writing the results as a csv
fwrite(
  x = own_region_model_dt,
  file = here("Data/electricity-generation/output/PPRegressors-own-region.csv")
)



# Calculating correlation between regions 
p_load(corrplot)
nerc_load_dt_raw[year(utc_time) == 2019, .(nerc_adj, utc_time, excess_load)] |>
  dcast(utc_time ~ nerc_adj, value.var ='excess_load') %>% 
  .[,-'utc_time'] |> 
  cor() |>
  corrplot(
    type = 'upper',tl.col = "black", tl.srt = 45
  )



